sp_tree <- tree %>%
  ggtree(aes(color = supergroup),
    linewidth = 1.0,
    layout = "rectangular"
  ) %<+% tree_meta$tips +
  geom_tiplab(align = TRUE) +
  scale_color_manual(values = SUPERGROUP_COLS) +
  geom_treescale() +
  scale_x_continuous(expand = expansion(0.2)) +
  scale_y_tree()
## Warning: `expand_scale()` was deprecated in ggplot2 3.3.0.
## ℹ Please use `expansion()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
# Adjust the second plot to align with the tree and facet grid titles properly

hm_for_tree_id <- ggtreeplot(sp_tree, og_by_id_meta, data_label = "id", expand_limits = expand_scale(0, 1.2)) +
  geom_tile(aes(x = "Mean COGEQC hscore scaled vs all", fill = cogeqc_hscore_scaled_all_mean)) +
  scale_fill_gradient2(low = "#2166AC", high = "#B2182B", mid = "#F7F7F7", midpoint = median(og_by_id_meta$cogeqc_hscore_scaled_all_mean, na.rm = TRUE), guide = guide_colorbar(order = 1)) +
  
  new_scale_fill() +
  geom_tile(aes(x = "Mean OG size", fill = total_parent_seqs_mean)) +
  scale_fill_gradient2(low = "#FCFBFD", high = "#3F007D", guide = guide_colorbar(order = 2)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs including species", fill = num_orthogroups)) +
  scale_fill_gradient2(low = "#EFF3FF", high = "#08519C", guide = guide_colorbar(order = 3)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with < 10 genes that include species", fill = parent_seqs_below_10_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 4)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 10-50 genes that include species", fill = parent_seqs_from_10_to_50_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 5)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 50-100 genes that include species", fill = parent_seqs_from_50_to_100_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 6)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 100-200 genes that include species", fill = parent_seqs_from_100_to_200_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 7)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 200-500 genes that include species", fill = parent_seqs_from_200_to_500_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 8)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 500-1000 genes that include species", fill = parent_seqs_from_500_to_1000_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 9)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with 1000-10000 genes that include species", fill = parent_seqs_from_1000_to_10000_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 10)) +

  new_scale_fill() +
  geom_tile(aes(x = "Num of OGs with > 10000 genes that include species", fill = parent_seqs_over_10000_sum)) +
  scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 11)) +

  theme_bw() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  facet_grid(. ~ OG_source, scales = "free_y", space = "free") +
  no_y_axis()


tree_hm_for_tree_id <- sp_tree + hm_for_tree_id +
  plot_annotation() +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom", legend.direction = "vertical")


## ggplot method


# plot the geom with y axis plotted using tree_y to reorder the y axis to match tree

hm_for_tree_id_gc <- ggtreeplot(sp_tree, og_by_id_gc_meta, aes(x = most_common_pfam_hmm_name), data_label = "id", expand_limits = expand_scale(0,1.2)) +
  geom_tile(aes(fill = cogeqc_hscore_scaled_all_mean)) +
  scale_fill_gradient2(
    low = "#2166AC",
    high = "#B2182B",
    mid = "#F7F7F7",
    midpoint = median(og_by_id_gc_meta$cogeqc_hscore_scaled_all_mean, na.rm = TRUE)
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(. ~ OG_source, scales = "free_y", space = "free") +
  no_y_axis()

# now to align the tree tips with the y axis of geom

tree_hm_for_tree_id_gc <- sp_tree + hm_for_tree_id_gc +
plot_annotation() +
plot_layout(guides = "collect")
# plot by tools
vn_cogeqc_hssc_tools <- og_meta %>% ggviolin(
    y = "cogeqc_hscore_scaled_all", x = "OG_source", 
    trim = TRUE, 
    add = "jitter",
    fill = "OG_source"
  ) +
    scale_fill_manual(values = OG_CALLER_COLS) +
    labs(y = "Scaled homogeneity scores", x = "Source of orthogroups",
         title = "Distribution of mean homogeneity scores for orthogroups") +
    theme(plot.subtitle = ggtext::element_markdown())

# plot by species, color tools
vn_cogeqc_hssc_sp <- og_meta %>%
separate_longer_delim(id, delim = ",") %>%
separate_longer_delim(supergroup, delim = ",") %>%
unique() %>%
mutate(id = factor(id, levels = unique(main_seq_long %>% arrange(supergroup) %>% pull(id)))) %>%
ggplot(aes(x = id, y = cogeqc_hscore_scaled_all, fill = OG_source)) +
  geom_violin(trim = TRUE, position = position_dodge(1)) +
  scale_fill_manual(values = OG_CALLER_COLS) +
  labs(y = "Scaled homogeneity scores", x = "Species",
       title = "Distribution of mean homogeneity scores for orthogroups") +
  theme(plot.subtitle = ggtext::element_markdown()) +
  theme_bw()

# hack to label axis with supergroup
vn_cogeqc_hssc_sp <- vn_cogeqc_hssc_sp + axis_lab_id_sg + 
plot_layout(ncol = 1, nrow = 2, heights = c(20, 0.5)) +
plot_layout(guides = "collect")
sp_ogs_coghs_vs_size_vs_sgs <- og_meta %>%
  ggplot(aes(x = og_size,
  y = cogeqc_hscore_scaled_all,
  size = total_supergroups,
  color = OG_source)) +
  geom_point(alpha=0.7) +
  scale_size(range = c(0, 10)) +
  scale_color_manual(values = OG_CALLER_COLS) +
  theme(legend.position = "none") +
  theme_bw()

vn_size_tools <- og_meta %>%
    ggviolin(y = "og_size", x = "OG_source", 
    trim = TRUE, 
    add = "jitter",
    fill = "OG_source"
  ) +
    scale_fill_manual(values = OG_CALLER_COLS) +
    labs(y = "Num genes in OG", x = "Source of orthogroups") +
    theme(plot.subtitle = ggtext::element_markdown()) +
    scale_y_continuous(trans = 'log10', labels = scales::comma)
# Create the heatmap using ggplot2
jaccard_metrics_heat_callers <- jaccard_metrics_all_long %>%
ggplot(aes(x = tool1, y = tool2, fill = Value)) +
  geom_tile() +
  scale_fill_gradient2(low = "#2166AC", high = "#B2182B", mid = "#F7F7F7", midpoint = 0.5) +
  facet_wrap(~Metric) +
  theme_minimal() +
  labs(
    title = "Heatmaps of Metrics by Tool Pair",
    x = "Tool 1", y = "Tool 2", fill = "Value"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(strip.text = element_text(size = 12)) +
  coord_fixed(ratio = 1)

# OG vs OG heatmap prep

hm_ogs_vs <- og_meta %>%
filter(OG_source == "orthofinder") %>%
mutate(Orthogroup = factor(Orthogroup, levels = unique(Orthogroup))) %>%
ggplot() +
  geom_tile(aes(x = "COGEQC hscore scaled vs all", y = Orthogroup, fill = cogeqc_hscore_scaled_all)) +
  scale_fill_gradient2(low = "#FFF5F0", high = "#67000D", limits = c(0, 1), midpoint = 0.25, guide = guide_colorbar(order = 5)) +

  new_scale_fill() +
  geom_tile(aes(x = "% of OG that is the most common Pfam", y = Orthogroup, fill = percent_most_common_pfam_hmm_name)) +
  scale_fill_gradient2(low = "#FCFBFD", high = "#3F007D", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 1)) +

  new_scale_fill() +
  geom_tile(aes(x = "OG size", y = Orthogroup, fill = og_size)) +
  scale_fill_gradient2(low = "#B2182B", high = "#2166AC", mid = "#F7F7F7", guide = guide_colorbar(order = 6)) +

  new_scale_fill() +
  geom_tile(aes(x = "Best JI vs ", y = Orthogroup, fill = Best_Jaccard)) +
  scale_fill_gradient2(low = "#762A83", high = "#1B7837", mid = "#F7F7F7", limits = c(0, 1), midpoint = 0.5, guide = guide_colorbar(order = 4)) +

  new_scale_fill() +
  geom_tile(aes(x = "OG source", y = Orthogroup, fill = OG_source)) +
  scale_fill_manual(values = OG_CALLER_COLS, guide = guide_legend(order = 7)) +

  new_scale_fill() +
  geom_tile(aes(x = "% species in OG", y = Orthogroup, fill = percent_total_ids)) +
    scale_fill_gradient2(low = "#F7FCF5", high = "#00441B", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 2)) +

  new_scale_fill() +
  geom_tile(aes(x = "% supergroups in OG", y = Orthogroup, fill = percent_total_supergroups)) +
  scale_fill_gradient2(low = "#EFF3FF", high = "#08519C", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 3)) +

  new_scale_fill() +
  geom_tile(aes(x = "vs OG source", y = Orthogroup, fill = vs_OG_source)) +
  scale_fill_manual(values = OG_CALLER_COLS, guide = guide_legend(order = 8)) +

  theme_bw() +
  theme(legend.position = "bottom",
  legend.direction = "vertical",
  legend.title = element_blank(),
  axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(most_common_pfam_hmm_name ~ ., scales = "free", space = "free")
up_in_ogs <- list()

# complex upset - tries to use column named id as data by default
# make matrices - for supergroups
up_in_ogs$sg <- create_presence_matrix(main_seq_long, supergroup, Orthogroup)
up_in_ogs$sg_cols <- colnames(up_in_ogs$sg)[-1]
up_in_ogs$sg <- up_in_ogs$sg %>% left_join(og_meta %>% rename(species = id), by = "Orthogroup")

# Prepare the colours for the sets, some R packages are so annoying!!!
queries_list <- purrr::map(names(SUPERGROUP_COLS), ~{
  upset_query(set = .x, color = SUPERGROUP_COLS[.x], fill = SUPERGROUP_COLS[.x])
}) %>%
purrr::keep(~ .x$set %in% up_in_ogs$sg_cols)


up_ogs_by_sg <- upset(
  up_in_ogs$sg,
  up_in_ogs$sg_cols,
  annotations = list(
    'OG size' = (
      ggplot(mapping = aes(y = og_size))
      + geom_jitter(aes(color = OG_source), na.rm = TRUE)
      + geom_violin(alpha = 0, na.rm = TRUE)
      + scale_y_continuous(trans = 'log10', labels = scales::comma)
      + scale_color_manual(values = OG_CALLER_COLS)
      ),
      'OG scaled Homogeneity Score' = (
          # note that aes(x=intersection) is supplied by default
          ggplot(mapping = aes(y = cogeqc_hscore_scaled_all))
          + geom_jitter(aes(color = OG_source), na.rm = TRUE)
          + geom_violin(alpha = 0, na.rm = TRUE)
          + scale_color_manual(values = OG_CALLER_COLS)
      ),
      'OG source' = (
          ggplot(mapping = aes(fill = up_in_ogs$sg$OG_source))
         + geom_bar(stat = 'count', position = 'fill')
         + scale_fill_manual(values = OG_CALLER_COLS)
         + ylab('OG source')
      )
  ),
  min_size = 10,
  width_ratio = 0.1,
  sort_intersections_by = 'degree',
  queries = queries_list
)


# make matrices - for geneclasses
up_in_ogs$gc <- create_presence_matrix(main_seq_long, pfam_hmm_name, Orthogroup)
up_in_ogs$gc_cols <- colnames(up_in_ogs$gc)[-1]
up_in_ogs$gc <- up_in_ogs$gc %>% left_join(og_meta %>% rename(species = id), by = "Orthogroup")

up_ogs_by_gc <- upset(
  up_in_ogs$gc,
  up_in_ogs$gc_cols,
  annotations = list(
    'OG size' = (
      ggplot(mapping = aes(y = og_size))
      + geom_jitter(aes(color = OG_source), na.rm = TRUE)
      + geom_violin(alpha = 0, na.rm = TRUE)
      + scale_y_continuous(trans = 'log10', labels = scales::comma)
      + scale_color_manual(values = OG_CALLER_COLS)
      ),
      'OG scaled Homogeneity Score' = (
          # note that aes(x=intersection) is supplied by default
          ggplot(mapping = aes(y = cogeqc_hscore_scaled_all))
          + geom_jitter(aes(color = OG_source), na.rm = TRUE)
          + geom_violin(alpha = 0, na.rm = TRUE)
          + scale_color_manual(values = OG_CALLER_COLS)
      ),
      'OG source' = (
          ggplot(mapping = aes(fill = up_in_ogs$gc$OG_source))
         + geom_bar(stat = 'count', position = 'fill')
         + scale_fill_manual(values = OG_CALLER_COLS)
         + ylab('OG source')
      )
  ),
  min_size = 5, # this is really important as it controls how many are shown
  width_ratio = 0.1,
  sort_intersections_by = 'degree'
)

1 Overall Comparison of OG callers

1.1 Orthogroup callers overall OG composition Similarity

jaccard_metrics_heat_callers


1.2 OG Homogeneity by OG callers

vn_cogeqc_hssc_tools
## Warning: Removed 489 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 489 rows containing missing values or values outside the scale range
## (`geom_point()`).


1.3 OG Homogeneity by Species

vn_cogeqc_hssc_sp
## Warning: Removed 809 rows containing non-finite outside the scale range
## (`stat_ydensity()`).


2 Ortholog Properties

2.1 Orthogroup statistics

hm_ogs_vs


2.2 Orthogroup homogeneity score vs OG size

sp_ogs_coghs_vs_size_vs_sgs
## Warning: Removed 489 rows containing missing values or values outside the scale range
## (`geom_point()`).


2.3 Orthogroup size by tool

vn_size_tools


3 Ortholog Distribution

3.1 Species tree

tree_hm_for_tree_id


3.2 Species tree by geneclass

tree_hm_for_tree_id_gc


3.3 OG distribution across supergroups

up_ogs_by_sg
## Warning: Removed 81 rows containing non-finite outside the scale range
## (`stat_count()`).


3.4 OG distribution across gene classes

up_ogs_by_gc
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_count()`).